********************************************************************************
* Master file for running full code for AKM and PageRanks.
*
* Original code based on Morchio and Moser (2024) NBER WP#32408
********************************************************************************
********************************************************************************
* INITIAL HOUSEKEEPING
********************************************************************************

// reset and start timer 1; it is stopped and listed at the end of this file
timer clear 1
timer on 1

// data to use: 0 = full raw data files; 1 = sample; 2 = mini-sample
global sample 0

// first and last year of analysis
global year_min 2009
global year_max 2016

// step switches (nonzero = run the step's do-file below, 0 = skip it)
// step 1: find connected set
global VALUE_1_CONNECTED 1
// step 2: estimate AKM wage equation
global VALUE_2_AKM 1
// step 3: compute PageRanks
global VALUE_3_PAGERANKS 1


********************************************************************************
* AUTOMATED STEPS
********************************************************************************
* call user-defined function to define time stamp file name extension
// NOTE(review): presumably this is where the global ext used in the log name below gets defined -- confirm
do "${DO_DIR}/FUN_EXTENSION.do"

* set processors
// Use the requested number of CPUs (global n_cpus, expected to be defined
// before this file runs), capped at the maximum Stata reports as available.
// If n_cpus is not defined, fall back to that maximum rather than aborting
// with a syntax error from an empty argument to min().
local n_cpus_req "${n_cpus}"
if "`n_cpus_req'" == "" local n_cpus_req = c(processors_max)
set processors `=min(`n_cpus_req', c(processors_max))'

* define sample prefix
// NOTE(review): a prefix is only assigned for sample == 0. For sample == 1 or 2
// the global sample_prefix keeps whatever value it already has (empty if never
// defined) -- confirm that it is set elsewhere for those cases.
if $sample == 0 global sample_prefix = ""

* start log file
global log_name = "log_VALUE_${year_min}_${year_max}_${ext}"
// Close a log of the same name left open by an earlier run that stopped before
// reaching "log close"; otherwise "log using" fails because the name is in use.
// capture makes this a no-op when no such log is open.
capture log close value_master
log using "${LOG_DIR}/${log_name}.log", text name(value_master) replace


********************************************************************************
* SET PARAMETERS
********************************************************************************
* variables to load

// employer identifier: "empid_est" = establishment level; "empid_firm" = firm level
global empid_var "empid_est"

// income measure: earn_mean_mw, earn_dec_mw, earn_mean, earn_dec, or earn_contract
global income_var "earn_mean_mw"

// categorical variable used for the "by category" options: gender or hschool
global categ_var "gender"

// full list of variables to load (includes the employer ID and income variables chosen above)
global vars_list "year persid gender hschool edu age ${empid_var} ind07_5 muni hours hours_year occ02_6 tenure exp_act ${income_var}"

* sample selection parameters (one _min and one _max global per variable)

// identifier variables (the employer ID name is taken from the global empid_var)
global persid_min 1
global persid_max = 10^11
global ${empid_var}_min 1
global ${empid_var}_max = 10^14

// gender: 1 = male; 2 = female; . = missing
global gender_min 1
global gender_max 2

// hschool: 1 = no completed high school; 2 = completed high school; . = missing
global hschool_min 1
global hschool_max 2

// race: 1 = Indigenous; 2 = White; 3 = Black; 4 = Asian; 5 = Brown; . = missing
global race_min = .
global race_max = .

// edu: 1 = illiterate; 2 = some primary; 3 = primary; 4 = some middle; 5 = middle;
//      6 = some high; 7 = high; 8 = some college; 9 = Bachelor's; 10 = Masters;
//      11 = PhD; . = missing
global edu_min 1
global edu_max 11

// age: 1-99 = age in years; . = missing
global age_min 18
global age_max 54

// industry and occupation codes
global ind07_5_min 0
global ind07_5_max 99999
global occ02_6_min 0
global occ02_6_max 999999

// hours (upper bounds are stored as evaluated numbers: 7*24 = 168 and 365*24 = 8760)
global hours_min 1
global hours_max = 7*24
global hours_year_min 1
global hours_year_max = 365*24

// municipality codes
global muni_min 1
global muni_max 999999

// earnings measures (one pair of bounds for each admissible value of the global income_var)
global earn_mean_mw_min 1
global earn_mean_mw_max = 10^3
global earn_dec_mw_min 1
global earn_dec_mw_max = 10^3
global earn_mean_min 0
global earn_mean_max = 788*10^3
global earn_dec_min 0
global earn_dec_max = 788*10^3
global earn_contract_min 0
global earn_contract_max = 788*10^3

* parameters for defining connected set

// 0 = save connected set with random file name extension;
// 1 = save connected set without extension, i.e., make it the default file to use
global connected_default 1

// 0 = create connected set for population;
// 1 = create connected set separately by ${categ_var}
global connect_by_categ 1

// 0 = create weakly connected set; 1 = create strongly connected set
global connect_strong 1

// 0 = keep all observations;
// 1 = recursively drop singletons by employer ID and worker ID
global drop_singletons 1

// minimum employment threshold (>=1)
global min_size_emp 10

// how the minimum employer threshold (${min_size_emp}) is imposed:
// 0 = across pooled years;
// 1 = on average in each year across total timespan;
// 2 = on average in each year across years that firm exists;
// 3 = in each year
global min_size_emp_years 2

// 0 = apply minimum employer threshold w.r.t. all workers;
// 1 = apply minimum employer threshold w.r.t. nonsingleton workers (i.e., workers
//     that are observed at least one more time at a future date) -- see Sorkin ('18 QJE)
global min_size_emp_nonsingletons 1

// minimum number of employer-years per employer in the sample (>=1)
global min_emp_years 3

// minimum number of hires from nonemployment, i.e., from outside of RAIS (>=0)
// -- Bagger & Lentz (REStud '19) and Sorkin (QJE '18) set = 1
global min_hire_UE 1

// minimum number of hires (>=0)
// -- Bagger & Lentz (REStud '19) set = 15, Sorkin (QJE '18) sets = 0
global min_hire_tot 10

// connected set for PageRank estimation, without selection criteria:
// 0 = use annual data; 1 = use monthly data
global connected_pagerank_monthly 1

* AKM estimation parameters

// 0 = save AKM estimates with random file name extension;
// 1 = save AKM estimates without extension, i.e., make it the default file to use
global akm_default 1

// 0 = run AKM on pooled data; 1 = run AKM separately by ${categ_var}
global akm_by_categ 1

// 0 = run AKM on monthly wage;
// 1 = run AKM on hourly wage (make sure to set akm_hours = 0)
global akm_hourly_wage 1

// age terms in AKM regression:
// 0 = none;
// 1 = age dummies with income-age profile restricted to be flat from age
//     ${age_flat_min}-${age_flat_max};
// 2, 3, etc. = include 2nd order term, 2nd and 3rd order terms, etc.
global age_poly_order 3

// minimum and maximum age for which income-age profile is restricted to be flat
// (only relevant if age_poly_order == 1)
global age_flat_min 35
global age_flat_max 40

// age around which to normalize higher-order age polynomial terms in AKM estimation
// (relevant only if ${age_poly_order} >= 2; coincides with where the age profile is
// assumed to be flat for interpretation of worker FEs and year FEs)
global age_norm 40

// 0 = no education interactions;
// 1 = interact education with time trends and age profiles
global edu_inter 1

// optional controls in AKM estimation (0 = do not include; 1 = include)
// hours controls
global akm_hours 0
// occupation controls
global akm_occ 0
// tenure controls
global akm_tenure 0
// actual experience controls
global akm_exp_act 0

// 0 = leave all variables as originally coded;
// 1 = coarsen variables before AKM estimation
global akm_coarsen 0

// minimum number of observations used for coarsening categories of each AKM
// independent variable
global N_coarsen 15

// space-separated list of industries where firm FEs are normalized to zero
// (e.g., "56112" or "56121")
global akm_norm_ind_list "56112"

* PageRank estimation parameters

// 0 = save PageRank estimates with random file name extension;
// 1 = save PageRank estimates without extension, i.e., make it the default file to use
global pagerank_default 1

// PageRanks computed without selection criteria using:
// 0 = annual data; 1 = monthly data
global pagerank_monthly 1

// 0 = compute PageRank on pooled data; 1 = compute PageRank separately by ${categ_var}
global pagerank_by_categ 1

// count self-nodes? 0 = no; 1 = yes
global pagerank_self 0

// use weighted nodes? 0 = no; 1 = yes
global pagerank_weight 1

// damping factor (between 0.00 and 1.00), which is the probability that a random
// surfer clicks on a link on the current page, instead of continuing on another
// random page. Standard value in computer science = 0.85, but Sorkin (2018, QJE)
// sets = 1.00.
global pagerank_damping = 0.80

// 0 = split ties in PageRanks arbitrarily by injecting a small amount of noise;
// 1 = allow for ties in PageRanks (maybe break them later)
global pagerank_ties 1

* employer ranks estimation

// 0 = compute Ranks on pooled data; 1 = compute Ranks separately by ${categ_var}
global ranks_by_categ 1

// 0 = do not recompute PageRanks in VALUE_4_RANKS.do;
// 1 = recompute PageRanks in VALUE_4_RANKS.do
global ranks_recompute_pageranks 0

* other parameters

// number of batches to split panel data into (higher numbers take more CPU time
// but less RAM) for connected set, employer ranks, and labor market parameters
global n_batches 5

* system options

// version of Stata to save data files in
global saveold_v 13

* package options

// "" = use Stata-native functions; "g" = use (faster) gtools package
global gtools "g"


********************************************************************************
* PROGRAMS
********************************************************************************
*** run the user-defined do-file that loads programs
do "${DO_DIR}/FUN_PROGRAMS.do"


********************************************************************************
* EXECUTE CODE
********************************************************************************
// print start date and time, followed by five blank lines
display "STARTING ON ${S_DATE} AT ${S_TIME}."
display _newline(5)

// run each step's do-file only if the corresponding switch is nonzero
if ${VALUE_1_CONNECTED} {
	do "${DO_DIR}/VALUE_1_CONNECTED.do"
}
if ${VALUE_2_AKM} {
	do "${DO_DIR}/VALUE_2_AKM.do"
}
if ${VALUE_3_PAGERANKS} {
	do "${DO_DIR}/VALUE_3_PAGERANKS.do"
}


********************************************************************************
* FINAL HOUSEKEEPING
********************************************************************************
// stop timer 1 and list it, which leaves the elapsed seconds in r(t1)
timer off 1
timer list 1

// report finish date and time together with the total run time
display "FINISHED ON ${S_DATE} AT ${S_TIME} IN A TOTAL OF `=r(t1)' SECONDS."

// close every open log, then clear everything from memory
log close _all
clear all
